Load libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaflet)
library(sf)
## Warning: package 'sf' was built under R version 4.1.2
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(raster)
## Warning: package 'raster' was built under R version 4.1.2
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
library(sp)
library(diffdf)
library(ggplot2)

Loading and cleaning data

let’s load the 2019 data

file_name <- "../data/bluebikes_tripdata_2019.csv"
# read
data_2019 <- read_csv(file_name, progress = FALSE)
## Rows: 2522771 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): start station name, end station name, usertype
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2019 <- data_2019 %>% rename("start_long" = "start station longitude",
                                      "start_lat" = "start station latitude",
                                      "end_long" = "end station longitude",
                                      "end_lat" = "end station latitude",
                                      "start_station_id" = "start station id",
                                      "start_station_name" = "start station name",
                                      "end_station_name" = "end station name",
                                      "birth_year" = "birth year")

Clean the 2019 data

# select the columns I need
cleaned_2019 <- data_2019 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

cleaned_2019
## # A tibble: 2,522,771 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1          790 2019-12-01 00:01:25 2019-12-01 00:14:35      42.4      -71.1
##  2          166 2019-12-01 00:05:42 2019-12-01 00:08:29      42.4      -71.1
##  3          323 2019-12-01 00:08:28 2019-12-01 00:13:52      42.4      -71.1
##  4          709 2019-12-01 00:08:38 2019-12-01 00:20:27      42.4      -71.1
##  5          332 2019-12-01 00:10:08 2019-12-01 00:15:41      42.4      -71.1
##  6          507 2019-12-01 00:14:26 2019-12-01 00:22:53      42.4      -71.1
##  7          794 2019-12-01 00:17:46 2019-12-01 00:31:01      42.4      -71.1
##  8         1354 2019-12-01 00:18:19 2019-12-01 00:40:54      42.4      -71.1
##  9         1227 2019-12-01 00:19:01 2019-12-01 00:39:29      42.3      -71.1
## 10         1068 2019-12-01 00:24:41 2019-12-01 00:42:29      42.3      -71.1
## # … with 2,522,761 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>

Select the unique entries of start stations

unique_start <- cleaned_2019 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start <- unique_start[!(unique_start$start_long == 0.0000|unique_start$start_lat == 0.0000), ]

# other outliers
unique_start <- unique_start %>% filter(start_station_name != "BCBS Hingham")
unique_start <- unique_start %>% filter(start_station_name != "Main St at Beacon St")

I do the same for the end stations

unique_end <- cleaned_2019 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()
# I do the same for the end stations
unique_end <- unique_end[!(unique_end$end_long == 0.0000|unique_end$end_lat == 0.000), ]
# another outlier
unique_end <- unique_end %>% filter(end_station_name != "BCBS Hingham")

Let’s do the same for the 2020 data

file_name <- "../data/bluebikes_tripdata_2020.csv"
# read
data_2020 <- read_csv(file_name, progress = FALSE)
## Rows: 1999446 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): start station name, end station name, usertype, postal code
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2020 <- data_2020 %>% rename("start_long" = "start station longitude",
                                      "start_lat" = "start station latitude",
                                      "end_long" = "end station longitude",
                                      "end_lat" = "end station latitude",
                                      "start_station_id" = "start station id",
                                      "start_station_name" = "start station name",
                                      "end_station_name" = "end station name",
                                      "birth_year" = "birth year")

Let’s clean the 2020 data

cleaned_2020 <- data_2020 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

cleaned_2020
## # A tibble: 1,999,446 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1         1793 2020-11-01 00:00:18 2020-11-01 00:30:12      42.3      -71.0
##  2         1832 2020-11-01 00:00:34 2020-11-01 00:31:07      42.3      -71.0
##  3          262 2020-11-01 00:01:54 2020-11-01 00:06:17      42.3      -71.0
##  4          419 2020-11-01 00:04:00 2020-11-01 00:10:59      42.4      -71.1
##  5          275 2020-11-01 00:04:01 2020-11-01 00:08:36      42.4      -71.1
##  6          684 2020-11-01 00:04:39 2020-11-01 00:16:03      42.3      -71.1
##  7          608 2020-11-01 00:04:43 2020-11-01 00:14:52      42.4      -71.1
##  8          438 2020-11-01 00:05:57 2020-11-01 00:13:15      42.4      -71.1
##  9          541 2020-11-01 00:07:15 2020-11-01 00:16:16      42.4      -71.1
## 10         1138 2020-11-01 00:07:17 2020-11-01 00:26:16      42.4      -71.1
## # … with 1,999,436 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>
unique_start_2020 <- cleaned_2020 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start_2020 <- unique_start_2020[!(unique_start_2020$start_long == 0.0000|unique_start_2020$start_lat == 0.0000), ]

# another outlier
unique_start_2020 <- unique_start_2020 %>% filter(start_station_name != "BCBS Hingham")

unique_end_2020 <- cleaned_2020 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()

unique_end_2020 <- unique_end_2020[!(unique_end_2020$end_long == 0.0000|unique_end_2020$end_lat == 0.0000), ]

unique_end_2020 <- unique_end_2020 %>% filter(end_station_name != "BCBS Hingham")

Routes

Top routes in 2019

# start stations 2019
unique_start %>% dplyr::select(start_station_name) %>% unique() %>%  count() # 359
## # A tibble: 1 × 1
##       n
##   <int>
## 1   359
# get the station names
stations_2019 <- table(data_2019$start_station_name)
# convert to tibble
stations_2019_df <- as.data.frame(stations_2019) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df$Freq)
##   [1]  13 237 233  20 355 232 300 347  82 123 236 150  86  88 246 247 160 345
##  [19] 187 222 309 318 234  46 225 260  14   8 162 361  34 161 214  93  99 343
##  [37] 342 155 315   5  69 306  51 133 154 258 311 322 196 330  33  92  85  39
##  [55] 288 156 354 223 165 216 193   2  40 333 313 159 114 332  47 131 335  27
##  [73] 274 102 146  41 145 157 310 104 316 334 182  26 340 360 273 242 254  98
##  [91] 136  66 151 134 294 125  54 277 281  31 143 204 312  50 148 138  12 265
## [109] 287   9 329  73 344  68 140 135 249 121 262  43 231 314 348  15  10 336
## [127] 132 158 115 137 181  58 110 362  57 279  32 224 107 270 153 101  45  90
## [145] 139 350 127  28  87 217 124 351  67 352 198  18 266  72 243 280 245 152
## [163] 226 144  42 189 308 111 177 295 255 290 149 105 186  17  70 346 304   7
## [181] 141  49  23 219 363 213 235 211 197  16 220 286  64 221 202 194 303 184
## [199]  74 215  11  37 264  65 239 240 167  75 285 164  19 349 116 293 147 190
## [217] 326 267 188   3 195 206  94  71 299 261 112 250 106 324 283 251 191 212
## [235] 358  52 359 185 317 272 122 305 166 176 296 253  56  63 208 289 117 100
## [253] 113 292 323 302 268 169 179 327 109 103 142  38 337   4 168 339 259  35
## [271] 118 301  44 269 297 271 338  25 171 199  76 357  91 276 126 248  59 178
## [289]   1 356 320  95 257 328  77 319 307 175 128 275  78  21 203  79 325  81
## [307] 291 180 192 321  55 183 163 174 170 129   6 172 263 119  96 209 341 278
## [325] 130 201  29  24 282 256 207  60  53 210 244 284  62 238 205 331 353  48
## [343] 218 108 120  80  89  36  83  30 252  61  97 230 228 173 241 200  22 298
## [361] 229  84 227
# select the top 36
top36_2019 <- stations_2019_df[c(24, 282, 256, 207, 60, 53, 210, 244, 284, 62, 238, 205, 331, 353, 48, 218, 108, 120, 80, 89, 36, 83, 30, 252, 61, 97, 230, 228, 173, 241, 200, 22, 298, 229, 84, 227),]
# rename station name column
top36_2019 <- rename(top36_2019,
                           "start_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019 <- left_join(top36_2019, unique_start, by = "start_station_name")

# somehow I got 41 stations from this, but let's plot them anyway 
leaflet() %>% addTiles() %>% 
  addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat)
# end stations 2019
unique_end %>% dplyr::select(end_station_name) %>% unique() %>%  count() # 360
## # A tibble: 1 × 1
##       n
##   <int>
## 1   360
# get the station names
stations_2019_end <- table(data_2019$end_station_name)
# convert to tibble
stations_2019_df_end <- as.data.frame(stations_2019_end) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df_end$Freq)
##   [1]  13 237 341 233  20 232 356 348 300  82  86  88 123 150 160 236 346 247
##  [19] 187 246 234 318 309 225 222  46 155  34  14  69 260 315 344 362 214   8
##  [37]  93 162  99 156 161 343 133 154   5 306  51 258 322 355 288 193  33 330
##  [55] 165 196  39  85 311  92 313  40 332 333  47 216 223 102 114 131 146 121
##  [73] 157 335  27 159 145 104 294  41 242 182 316 274 334 310 340  98 361 273
##  [91]  26 136 125  66 254 151 134  54   2 143 277 349  31 314 281 138 204  50
## [109] 148 312  12 140 287 231   9 265  68 135 345 329  73  43 262 110 249  10
## [127]  15 336 132 158  58  67 137 181 115  32  57 279 347 363 107 224 270 295
## [145] 101  45  90 139 280 111 153 144 352 217  87 351 255 266 353 127  70  42
## [163] 198 226 124 152 243  18 177 189 186 245 290 308  49 211  17  37  72 286
## [181] 197 213 304 364 141 219  23  64 105   7  19 235  16 220 149  28 293 202
## [199]  75 221 285 184 303 194  11 215 240 188  65 167  74 164 112 350 261 239
## [217] 147 116 267 326   3  94 195 113 250 206  71 212 360 264 283 324 317 251
## [235] 106 190  52 191 185 359 122 299 176 253 305 339 296  63 272 323 208 302
## [253] 169 292 179 289 109  56 100 327 268  38 142 118 103 337 297 117   4 168
## [271]  44  35 269 259 301 271 166  91 171 126 276 199  76 338 178 358 357  77
## [289] 248 163 320   1 319  59 257  25  95 175   6 192 307 328  79  21  78 275
## [307] 203 183 325 291 128 321  81 180 174 263 342 172  55  96 119 256 129 170
## [325]  29 201 278 282 207  24 130 284 331  60  53 244 209  62 210 238 205 218
## [343]  48 354 108  80  89  36 252 120  30  83  61  97 230 200 228 173 298 229
## [361] 241  22  84 227
# select the top 36
top36_2019_end <- stations_2019_df_end[c(207, 24, 130, 284, 331, 60,  53, 244, 209,  62, 210, 238, 205, 218,  48, 354, 108,  80,  89,  36, 252, 120,  30,  83,  61,  97, 230, 200, 228, 173, 298, 229, 241,  22,  84, 227),]
# rename station name column
top36_2019_end <- rename(top36_2019_end,
                           "end_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019_end <- left_join(top36_2019_end, unique_end, by = "end_station_name")

# transform to sf
coords_start_2019 <- top36_2019 %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)

coords_end_2019 <- top36_2019_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)

# get the coordinates
coords_2019 <- cbind(st_coordinates(coords_start_2019$geometry),st_coordinates(coords_end_2019$geometry))

# get the routes
linestrings_2019 = st_sfc(
     lapply(1:nrow(coords_2019),
           function(i){
             st_linestring(matrix(coords_2019[i,],ncol=2,byrow=TRUE))
           }))
# give them a crs
linestrings_2019 <- linestrings_2019 %>% st_set_crs(4326)
plot(linestrings_2019)

# get average length of linestring
st_length(linestrings_2019) %>% mean() # 1494.143 m = 1.5 km
## 1494.143 [m]
# plot the routes on a map
leaflet(linestrings_2019) %>% addTiles() %>% 
  setView(lng = -71.08083, lat = 42.361145, zoom = 13) %>% 
  addPolylines() %>% 
  addCircleMarkers(lng = top36_2019_end$end_long, lat = top36_2019_end$end_lat, color = "red", popup = paste0("<strong>End station: </strong>", top36_2019_end$end_station_name)) %>% 
  addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat, popup = paste0("<strong>Start station: </strong>", top36_2019$start_station_name)) %>% addScaleBar()

Let’s do the same for the top routes in 2020

# start stations 2020
unique_start_2020 %>% dplyr::select(start_station_name) %>% unique() %>%  count() # 384
## # A tibble: 1 × 1
##       n
##   <int>
## 1   384
stations_2020_start <- table(data_2020$start_station_name)

stations_2020_df_start <- as.data.frame(stations_2020_start) %>% as_tibble()

order(stations_2020_df_start$Freq)
##   [1] 253  33 288  49 287 208 382 215  95 134  34 184 232 364  30 227 381 349
##  [19] 263   6 231 281 168 154 159 176 118  73 174 250 340 229 277 234 329 360
##  [37]   2  48  25 238  92 367 375 257  75  18 265 158 362 138 320 365  94 311
##  [55] 252 109 374 338  62 186 167 293  50  42 107 353 221 124 358 150 331  53
##  [73] 239  13 337 286  88 166 292  99 259 245  36  43  24 334 307 378 149  74
##  [91] 352 155  57 197 172 384 264 140  98  26 136 163 164 314 160  71   8 332
## [109] 170 142 205  70 244 296 147 335 199 129 191 192 300 330 350 380 361  10
## [127] 249 161 333  60  31 260 126 354 203 241 143 104  12 152  52  44  37 326
## [145] 267 315 141  72 279 363 144  91  80 371 139 273 128 304  76 289  54 298
## [163] 308 196 157 114 200 385  93 224 137 207 242 272 216 145 368  78 370 383
## [181] 226 228  77   7 282 256  61 235 306 131 240 283 162  32 262 366 299 165
## [199] 291 251  17 386  16  27 206 313  21  81 302  15  39 171 372 346 106  40
## [217] 379 201  69 321 225 271  59  14  87  68 322   3 194 148 369  46 187 110
## [235]  41   4 218 151  11 117 278 198 121 230 323 336 285 173 309 185 120 325
## [253] 209 122 220 105 175 319  23 119 204 116 169 294 328 156 276 189 324 101
## [271] 327 344 180 355  79 111 153 108 348 343  58 310 317  82  55 376 177 269
## [289]  86   9  47 290  84 133 146 341 268 183 100 179 305 312 237  83 357 195
## [307] 255 280 236  67 345 113 188 347  97  63 316 178  56 213 202 181 303 212
## [325] 339  66 377 359  28 284 275 115  29 130  19 342 211   1 217 193  45  51
## [343] 135   5  35 125 295 356  22 132 351 190 270 318  64 233 266 102 123 297
## [361] 214 301 373 112 261 210 219  85  89 223 222 254 258 248 274 247 182  38
## [379] 246 127  65  20 103 243  96  90
top38_2020_start <- stations_2020_df_start[c(22, 132, 351, 190, 270, 318, 64, 233, 266, 102, 123, 297, 214, 301, 373, 112, 261, 210, 219, 85, 89, 223, 222, 254, 258, 248, 274, 247, 182, 38, 246, 127, 65, 20, 103, 243, 96, 90),]

top38_2020_start <- top38_2020_start %>% rename("start_station_name" = "Var1")

top38_2020_start <- left_join(top38_2020_start, unique_start_2020, by = "start_station_name")

# end stations 2020
unique_end_2020 %>% dplyr::select(end_station_name) %>% unique() %>% count() # 385
## # A tibble: 1 × 1
##       n
##   <int>
## 1   385
stations_2020_end <- table(data_2020$end_station_name)

stations_2020_df_end <- as.data.frame(stations_2020_end) %>% as_tibble()

order(stations_2020_df_end$Freq)
##   [1] 250 254  33 289 288 383  49 215 208 134  95  34 232 184 365  30 382 350
##  [19] 159 264 231 227 282 174 168 176   6 154 118  73 341 251 330 229 234 361
##  [37] 278 368  92  48  75 338 366 238  25 376 363 158 138  94 321 258  18 312
##  [55] 266 253 109 375 339  62 186 167  50 359  42 107 294 354 221 124 260 150
##  [73] 332  53 245  13  36 335   2 239 287  88  43 166 379  57 308  99  24 149
##  [91] 172 353 197 160 385  74 293 265 140 155  98 315 163 136  26 170 333   8
## [109]  71 147 142 164 126 244 199 205 362  70 297  72 161 336 129 249 191 192
## [127] 381  10 351 334 301  31 331 316 261 241 355 114  52  80 203 143 104  60
## [145]  76  12  44 152 268 280  37 305 327 274 144 141 364 367  91 139 309 128
## [163] 196 290 372  32 224  54 299 369 386 157 200  93 137 226 145 216 314 242
## [181] 273  78 207 257 228 371 283 300  61 384   7  77 284 235 131 307 240 162
## [199] 252 206 165  17  40 292 387  16 303  21 347 263 272  81 106  68  15  27
## [217] 171 373 380  39 225   3 201 322  87  14  46  59 117 148 323  11  69 194
## [235] 187 279 370  41 151   4 198 337 218 286 230 310 121 116 169 324 185 110
## [253] 120 326 122 209 173 220 175  23 119 105 295 320 189 356 204 180 329 156
## [271] 325 277 328 345 101 269 111  79 344 318 108 153  82 311 113 270 358  47
## [289] 100 183 291 146 177  84 377 306  86  83 237 313  58 349 342  55   9 133
## [307] 281 348 195 236 360 256 188  67 346  97 213 178 340 212  56 115 179 202
## [325]  66 317  63 181 285 130 378 276  28  29  19 343   5 304 217 211  51  45
## [343]   1 135  35  22 296 352 357 125 132 193 271 190 233 319  64 267 123 102
## [361] 214 302 298 112 210 374 262 219  85 247  89 223 248 275 255 246 222  38
## [379] 259 127 182  65  20 243 103  96  90
top38_2020_end <- stations_2020_df_end[c(125, 132, 193, 271, 190, 233, 319, 64, 267, 123, 102, 214, 302, 298, 112, 210, 374, 262, 219, 85, 247, 89, 223, 248, 275, 255, 246, 222,  38, 259, 127, 182, 65, 20, 243, 103, 96, 90),]

top38_2020_end <- top38_2020_end %>% rename("end_station_name" = "Var1")

top38_2020_end <- left_join(top38_2020_end, unique_end_2020, by = "end_station_name")
coords_start_2020 <- top38_2020_start %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)

coords_end_2020 <- top38_2020_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)

coords_2020 <- cbind(st_coordinates(coords_start_2020$geometry),st_coordinates(coords_end_2020$geometry))

linestrings_2020 = st_sfc(
     lapply(1:nrow(coords_2020),
           function(i){
             st_linestring(matrix(coords_2020[i,],ncol=2,byrow=TRUE))
           }))

linestrings_2020 <- linestrings_2020 %>% st_set_crs(4326)
plot(linestrings_2020)

st_length(linestrings_2020) %>% mean() # 2137.295 m = 2,1 km
## 2137.295 [m]
leaflet(linestrings_2020) %>% addTiles() %>% 
  setView(lng = -71.08083, lat = 42.361145, zoom = 13) %>% 
  addPolylines() %>% 
  addCircleMarkers(lng = top38_2020_end$end_long, lat = top38_2020_end$end_lat, color = "red", popup = paste0("<strong>End station: </strong>", top36_2019_end$end_station_name)) %>% 
  addCircleMarkers(lng = top38_2020_start$start_long, lat = top38_2020_start$start_lat, popup = paste0("<strong>Start station: </strong>", top36_2019$start_station_name)) %>% addScaleBar()

Seeing as I have plotted the routes in euclidian space, they do not resemble the real routes the users took very much. However, we are able to say something about where the users move between. It does not seem to be the case that there is a trend from north to south or from vest to east. Since most dots are purple, it seems that the users move within this space in all directions. However, very tentitively, the two red dots are below the river which could indicate a from north to south migration, which one could explore with more points and routes.

Elevation

Let’s explore the elevation of the city which also can have an impact of the movement opportunities the city offers.

# read in elevation raster
elevation <- raster("../data/_ags_ff2f2033_740d_47f5_8f3e_61b68c7d95f1.tif")

# read in the neighborhoods 
neighborhoods <- st_read("../data/Boston_Neighborhoods/Boston_Neighborhoods.shp")
## Reading layer `Boston_Neighborhoods' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Boston_Neighborhoods/Boston_Neighborhoods.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 26 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 739715.3 ymin: 2908294 xmax: 812255.1 ymax: 2970086
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
projected_neighborhoods <- st_transform(neighborhoods, 4326)

# define colors 
pal <- colorNumeric(c("#A52A2A", "#90A926", "#55EA13"), values(elevation),
  na.color = "transparent")
# plot raster on map 
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.067083, lat = 42.330145, zoom = 11.5) %>% addRasterImage(elevation, colors = pal, opacity = 0.8) %>% addLegend(pal = pal, values = values(elevation),
    title = "Elevation in meters above sea level") %>% 
  addPolygons(stroke = TRUE, fillColor = "transparent", color = "black", weight = 1) %>% addCircleMarkers(lng = top36_2019$start_long, lat = top36_2019$start_lat, color = "black") %>% addScaleBar()
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded ellps WGS 84 in Proj4 definition: +proj=merc +a=6378137
## +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null
## +wktext +no_defs +type=crs
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum World Geodetic System 1984 in Proj4 definition
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded ellps WGS 84 in Proj4 definition: +proj=merc +a=6378137
## +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null
## +wktext +no_defs +type=crs
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum World Geodetic System 1984 in Proj4 definition

There is a lot of elevation in the southern part of the city which probably also partly can explain the fact that there are less points there. The most used stations are centered in downtown Boston where there is little elevation.

Extract elevation for stations

# remove outliers if there are any
points <- unique_start %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))

# transform points to sf object
points_sf <- st_as_sf(points, coords = c("start_long", "start_lat"), crs = 4326)

# extract the elevations of the stations
points_sf$elevation  <- raster::extract(elevation, points_sf)
## Warning in .local(x, y, ...): Transforming SpatialPoints to the crs of the
## Raster
# Look at the points and extraction results - add polygon on top for scale
plot(points_sf["elevation"])

# transform crs of points and neighborhoods to that of elevation raster
points_elevation <- st_transform(points_sf, crs = crs(elevation, asText = TRUE))
neighborhoods_elevation <- st_transform(neighborhoods, crs = crs(elevation, asText = TRUE))

# plot the three object together
plot(elevation)
plot(points_elevation$geometry, add = TRUE)
plot(neighborhoods_elevation$geometry, border = "black", add = TRUE)

# histogram of frequency of elevation
ggplot(points_sf, aes(x = elevation)) + geom_histogram() + labs(x="Elevation", y = "Frequency") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).

ggsave("../data_outputs/elevation_histogram.jpg")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).

The first plot of the elevation of the stations show that there are many more stations located in areas with low elevation, and the second plot with the neighborhood polygon overlay confirms this.

We can see from the histogram that there are more points with lower elevation than higher, and from the plots that there is a trend towards the north of lower elevation and higher elevation towards the south. The histogram also tells us that there are no points in the areas with the highest elevation. The highest elevation for the points is 50 m. Therefore, Boston does have variation in elevation, however the users of the bikes do not move within the spaces of most elevation.

Get more routes - all unique stations of 2019

coords_start_2019_all <- unique_start %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)

coords_end_2019_all <- unique_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)

# find differences between the two objects
end <- coords_end_2019_all$end_station_name %>% as_tibble()
start <- coords_start_2019_all$start_station_name %>% as_tibble()

diffdf(end, start)
## Warning in diffdf(end, start): 
## There are rows in BASE that are not in COMPARE !!
## Not all Values Compared Equal
## Differences found between the objects!
## 
## A summary is given below.
## 
## There are rows in BASE that are not in COMPARE !!
## All rows are shown in table below
## 
##   ===============
##    ..ROWNUMBER.. 
##   ---------------
##         407      
##         408      
##   ---------------
## 
## Not all Values Compared Equal
## All rows are shown in table below
## 
##   =============================
##    Variable  No of Differences 
##   -----------------------------
##     value           396        
##   -----------------------------
## 
## 
## First 10 of 396 rows are shown in table below
## 
##   ===============================================================================================
##    VARIABLE  ..ROWNUMBER..                BASE                              COMPARE              
##   -----------------------------------------------------------------------------------------------
##     value          1                 Kenmore Square               Dartmouth St at Newbury St     
##     value          2          MIT at Mass Ave / Amherst St     MIT Stata Center at Vassar St ... 
##     value          3        Verizon Innovation Hub 10 Ware...  Inman Square at Springfield St... 
##     value          4        Sidney Research Campus/ Erie S...           Third at Binney          
##     value          5        Harvard Law School at Mass Ave...  Verizon Innovation Hub 10 Ware... 
##     value          6        Inman Square at Springfield St...           University Park          
##     value          7        Lower Cambridgeport at Magazin...  One Kendall Square at Hampshir... 
##     value          8               Elm St at White St          359 Broadway - Broadway at Fay... 
##     value          9        Packard's Corner - Commonwealt...  Prudential Center - 101 Huntin... 
##     value         10             Sydney St at Carson St                     Encore               
##   -----------------------------------------------------------------------------------------------
# remove rows with differences
coords_end_2019_all <- coords_end_2019_all[c(0-406),]
coords_end_2019_all <- coords_end_2019_all[c(0-406),]

coords_2019_all <- cbind(st_coordinates(coords_start_2019_all$geometry), st_coordinates(coords_end_2019_all$geometry))

# get all routes in 2019
linestrings_2019_all <- st_sfc(
  lapply(1:nrow(coords_2019_all),
         function(i){
           st_linestring(matrix(coords_2019_all[i,], ncol=2, byrow=TRUE))
         })
)

plot(linestrings_2019_all)

linestrings_2019_all <- linestrings_2019_all %>% st_set_crs(4326)

lengths <- st_length(linestrings_2019_all) 
lengths %>% mean() # 4811.787 == 4,8 km 
## 4811.787 [m]
lengths %>% median() # 4411.232 == 4.4 km
## 4411.232 [m]
#lengths <- lengths %>% as_tibble()
lengths <- lengths %>% as.numeric()
lengths <- lengths %>% as_tibble()

ggplot(lengths, aes(x = value)) + geom_histogram() + labs(x="Length in meters", y = "Frequency") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("../data_outputs/routes_histogram.jpg")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram visually confirms that the median length closely matches the mean length.